Load the necessary libraries
library(car) #for regression diagnostics
library(broom) #for tidy output
library(ggfortify) #for model diagnostics
library(sjPlot) #for outputs
library(knitr) #for kable
library(effects) #for partial effects plots
library(emmeans) #for estimating marginal means
library(ggeffects) #for partial effects plots
library(MASS) #for glm.nb
library(MuMIn) #for AICc
library(DHARMa) #for residual diagnostics plots
library(modelr) #for auxillary modelling functions
library(performance) #for residuals diagnostics
library(see) #for plotting residuals
library(patchwork) #for grids of plots
library(tidyverse) #for data wrangling
theme_set(theme_classic())
Loyn (1987) modeled the abundance of forest birds with six predictor variables (patch area, distance to nearest patch, distance to nearest larger patch, grazing intensity, altitude and years since the patch had been isolated).
Regent honeyeater
Format of loyn.csv data file
| abund | dist | ldist | area | graze | alt | yr_isol |
|---|---|---|---|---|---|---|
| .. | .. | .. | .. | .. | .. | .. |
| abund | Abundance of forest birds in patch- response variable |
| dist | Distance to nearest patch - predictor variable |
| ldist | Distance to nearest larger patch - predictor variable |
| area | Size of the patch - predictor variable |
| graze | Grazing intensity (1 to 5, representing light to heavy) - predictor variable |
| alt | Altitude - predictor variable |
| yr_isol | Number of years since the patch was isolated - predictor variable |
The aim of the analysis is to investigate the effects of a range of predictors on the abundance of forest birds.
loyn = read_csv('../data/loyn.csv', trim_ws=TRUE) %>%
janitor::clean_names()
## Parsed with column specification:
## cols(
## ABUND = col_double(),
## AREA = col_double(),
## YR.ISOL = col_double(),
## DIST = col_double(),
## LDIST = col_double(),
## GRAZE = col_double(),
## ALT = col_double()
## )
glimpse(loyn)
## Rows: 56
## Columns: 7
## $ abund <dbl> 5.3, 2.0, 1.5, 17.1, 13.8, 14.1, 3.8, 2.2, 3.3, 3.0, 27.6, 1.…
## $ area <dbl> 0.1, 0.5, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2…
## $ yr_isol <dbl> 1968, 1920, 1900, 1966, 1918, 1965, 1955, 1920, 1965, 1900, 1…
## $ dist <dbl> 39, 234, 104, 66, 246, 234, 467, 284, 156, 311, 66, 93, 39, 4…
## $ ldist <dbl> 39, 234, 311, 66, 246, 285, 467, 1829, 156, 571, 332, 93, 39,…
## $ graze <dbl> 2, 5, 5, 3, 5, 3, 5, 5, 4, 5, 3, 5, 2, 1, 5, 5, 3, 3, 3, 2, 2…
## $ alt <dbl> 160, 60, 140, 160, 140, 130, 90, 60, 130, 130, 210, 160, 210,…
Avoid multicollinearity (correlated variables CAN be included to soak up the variance but CANNOT be interpreted while other correlated variables are there!)
# pairs(loyn)
scatterplotMatrix(~abund + dist + ldist + area + graze + alt + yr_isol,
data=loyn, diagonal = list(method = 'boxplot'),
regLine = list(col="red"))
glm(abund ~ ., data = loyn) %>% vif
## area yr_isol dist ldist graze alt
## 1.337627 1.841657 1.227387 1.255028 2.307661 1.574537
Predictors must be transformed, as the family relates only to the response. Log area in particular is particularly skewed.
scatterplotMatrix(~abund + log(dist) + log(ldist) + log(area) +
graze + alt + yr_isol,
data=loyn, diagonal = list(method = 'boxplot'),
smooth=list(col.spread="green"),
regLine = list(col="red"))
glm(abund ~ log(dist) + log(ldist) + log(area) +
graze + alt + yr_isol, data = loyn) %>% vif
## log(dist) log(ldist) log(area) graze alt yr_isol
## 1.654553 2.009749 1.911514 2.524814 1.467937 1.804769
# loyn <- loyn %>%
# mutate(larea = log(area)) %>%
# dplyr::select(-area)
Graze has a high VIF, consider deleting. Also, should maybe be ordinal/categorical instead.
BIG NOTE: scale=F does not work with emmeans, as they are trying to get everyone to write FALSE rather than F for short!!
loyn <- loyn %>%
mutate(fgraze = as.factor(graze),
l_dist = log(dist),
l_ldist = log(ldist),
l_area = log(area),
sl_dist = scale(l_dist, scale=F),
sl_ldist = scale(l_ldist, scale=F),
sl_area = scale(l_area),
salt = scale(alt, scale=F),
syr_isol = scale(yr_isol, scale=F)
)
loyn.glm <- glm(abund ~ scale(l_dist, scale=F) +
scale(l_ldist, scale=F) + scale(l_area, scale=F) +
fgraze + scale(alt, scale=F) + scale(yr_isol, scale=F),
data = loyn,
family = gaussian(link = "identity"))
vif(loyn.glm) # none above 3
## GVIF Df GVIF^(1/(2*Df))
## scale(l_dist, scale = F) 1.907874 1 1.381258
## scale(l_ldist, scale = F) 2.228456 1 1.492801
## scale(l_area, scale = F) 2.201116 1 1.483616
## fgraze 7.925036 4 1.295314
## scale(alt, scale = F) 1.597400 1 1.263883
## scale(yr_isol, scale = F) 3.252591 1 1.803494
The rule is we don’t want values above 3 for VIF, otherwise, that variable is correlated to the others in a way that obscures its interpretation.
There’s also a function to easily compare scaled effect sizes of all variables rather than do so here. That way, we can still interpret the function with the appropriate units here, but also check effect sizes later on as well.
Model formula: \[ y_i \sim{} \mathcal{N}(\mu_i, \sigma^2)\\ log(\mu_i) = \boldsymbol{\beta} \bf{X_i} \]
where \(\boldsymbol{\beta}\) is a vector of effects parameters and \(\bf{X}\) is a model matrix representing the additive effects of the scaled versions of distance (ln), distance to the nearest large patch (ln), patch area (ln), grazing intensity, year of isolation and altitude on the abundance of forest birds.
autoplot(loyn.glm, which = 1:6)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Residuals are ok, normality not great, but should be robust to this. Cook’s d are all low (less than 0.8).
loyn.resid <- simulateResiduals(loyn.glm, plot=T)
Homoscedasticity is ok too.
When you have multiple variables, need to also plot the residuals against each predictor variable too.
loyn$resid <- loyn.resid$scaledResiduals
loyn %>%
ggplot(aes(x = l_dist, y = resid)) +
geom_point() +
geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
loyn %>%
ggplot(aes(x = l_ldist, y = resid)) +
geom_point() +
geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
loyn %>%
ggplot(aes(x = l_area, y = resid)) +
geom_point() +
geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
loyn %>%
ggplot(aes(x = alt, y = resid)) +
geom_point() +
geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
loyn %>%
ggplot(aes(x = yr_isol, y = resid)) +
geom_point() +
geom_smooth(se=F)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Didn't have enough time to finish this last plot
# loyn %>%
#
# group_by(fgraze) %>%
# summarise(mean = mean(resid),
# se = sd(resid) / sqrt( n() ),
# upr = gmodels::ci(resid)[3]) %>%
# ggplot(aes(x = fgraze, y = mean)) +
# geom_bar() +
# geom_pointrange(aes(ymin = lwr, ymax = upr))
loyn.glmb <- glm(abund ~ sl_dist + sl_ldist + sl_area + fgraze +
salt + syr_isol, data = loyn,
family = gaussian(link = "identity"))
loyn.glmb %>% ggemmeans(~sl_dist) %>% plot(add=T)
loyn.glmb %>% ggemmeans(~sl_ldist) %>% plot(add=T)
loyn.glmb %>% ggemmeans(~salt) %>% plot(add=T)
loyn.glmb %>% ggemmeans(~syr_isol) %>% plot(add=T)
loyn.glmb %>% ggemmeans(~sl_area + fgraze) %>% plot(add=T)
plot_model(loyn.glmb, type = 'eff', show.data = T, dot.size = 0.5) %>%
plot_grid
## Warning in plot_grid(.): Not enough tags labels in list. Using letters instead.
Note that if we had used log() and scale() in the glm function, plot_model would do the appropriate back-transform to the original scale, amazingly!
log-area is clearly important, as well as graze = 5.
plot(allEffects(loyn.glmb, residuals = T), type = "response")
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictors sl_dist, sl_ldist, sl_area, salt, syr_isol are one-column matrices
## that were converted to vectors
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictors sl_dist, sl_ldist, sl_area, salt, syr_isol are one-column matrices
## that were converted to vectors
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictors sl_dist, sl_ldist, sl_area, salt, syr_isol are one-column matrices
## that were converted to vectors
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictors sl_dist, sl_ldist, sl_area, salt, syr_isol are one-column matrices
## that were converted to vectors
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictors sl_dist, sl_ldist, sl_area, salt, syr_isol are one-column matrices
## that were converted to vectors
## Warning in Analyze.model(focal.predictors, mod, xlevels, default.levels, : the
## predictors sl_dist, sl_ldist, sl_area, salt, syr_isol are one-column matrices
## that were converted to vectors
plot_model(loyn.glm, type='est', transform='exp', show.values=TRUE)
summary(loyn.glm)
##
## Call:
## glm(formula = abund ~ scale(l_dist, scale = F) + scale(l_ldist,
## scale = F) + scale(l_area, scale = F) + fgraze + scale(alt,
## scale = F) + scale(yr_isol, scale = F), family = gaussian(link = "identity"),
## data = loyn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -15.8992 -2.7245 -0.2772 2.7052 11.2811
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.47274 2.35785 9.531 1.83e-12 ***
## scale(l_dist, scale = F) 0.14456 1.19334 0.121 0.9041
## scale(l_ldist, scale = F) 0.34641 0.92835 0.373 0.7107
## scale(l_area, scale = F) 2.96755 0.65287 4.545 3.97e-05 ***
## fgraze2 0.52851 3.25221 0.163 0.8716
## fgraze3 0.06601 2.95871 0.022 0.9823
## fgraze4 -1.24877 3.19838 -0.390 0.6980
## fgraze5 -12.47309 4.77827 -2.610 0.0122 *
## scale(alt, scale = F) 0.01070 0.02390 0.448 0.6565
## scale(yr_isol, scale = F) -0.01277 0.05803 -0.220 0.8267
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 37.27021)
##
## Null deviance: 6337.9 on 55 degrees of freedom
## Residual deviance: 1714.4 on 46 degrees of freedom
## AIC: 372.52
##
## Number of Fisher Scoring iterations: 2
Evidence of an effect of log-area and of grazing area 5 being less than grazing 1, while all other grazing levels were not different from grazing 1. Need to investigate pair-wise effects among the factor levels thereafter.
tidy(loyn.glm, confint=T)
glance(loyn.glm)
MuMIn::r.squaredLR(loyn.glm)
## [1] 0.7294968
## attr(,"adj.r.squared")
## [1] 0.7298744
augment(loyn.glm)
How to look at standardized effect sizes after the model is already run:
std_param_est <- MuMIn::std.coef(loyn.glm, partial.sd=T) %>%
as.data.frame %>%
rownames_to_column("parameter") %>%
janitor::clean_names() %>%
arrange(-abs(estimate))
Note: interactions between continuous variables REQUIRE them to be centered first
Can do dredging for different interactions:
loyn.glm3 <- glm(abund ~ (sl_area + salt + sl_dist + sl_ldist + fgraze +
syr_isol)^2, data = loyn,
family = gaussian(link = "identity"),
na.action = na.fail)
# dredge(loyn.glm3, rank = "AICc") %>% head # runs forever!
loyn.glm <- update(loyn.glm, na.action = na.fail)
dredge(loyn.glm, rank = "AICc")
## Fixed term is "(Intercept)"
Best model is indicated by ‘+’ for fgraze being important, as well as log-area (coef = 3.147).
dAICc = 2 is that the two models are basically the same. If more than two, are basically different. Thus, this model is the optimal model.
Multiply all coefficients by weight, assuming that the models are reasonable (i.e. deltaAIC = 10 or less).
loyn.av <- model.avg(dredge(loyn.glm, rank = "AICc"),
subset = (delta <=10) )
## Fixed term is "(Intercept)"
## Fixed term is "(Intercept)"
summary(loyn.av)
##
## Call:
## model.avg(object = dredge(loyn.glm, rank = "AICc"), subset = (delta <=
## 10))
##
## Component model call:
## glm(formula = abund ~ <21 unique rhs>, family = gaussian(link =
## "identity"), data = loyn, na.action = na.fail)
##
## Component models:
## df logLik AICc delta weight
## 13 7 -175.52 367.38 0.00 0.37
## 123 8 -175.43 369.93 2.56 0.10
## 135 8 -175.46 369.98 2.60 0.10
## 136 8 -175.47 370.00 2.62 0.10
## 134 8 -175.50 370.06 2.68 0.10
## 1235 9 -175.30 372.51 5.13 0.03
## 1234 9 -175.37 372.66 5.29 0.03
## 1356 9 -175.39 372.69 5.31 0.03
## 1236 9 -175.41 372.73 5.36 0.03
## 1346 9 -175.43 372.78 5.40 0.03
## 1345 9 -175.46 372.83 5.45 0.02
## 236 5 -181.43 374.06 6.68 0.01
## 36 4 -182.99 374.77 7.39 0.01
## 12356 10 -175.27 375.43 8.05 0.01
## 12345 10 -175.29 375.47 8.10 0.01
## 12346 10 -175.35 375.58 8.21 0.01
## 13456 10 -175.38 375.66 8.28 0.01
## 356 5 -182.36 375.92 8.54 0.01
## 2356 6 -181.32 376.35 8.97 0.00
## 2346 6 -181.38 376.47 9.10 0.00
## 346 5 -182.64 376.48 9.10 0.00
##
## Term codes:
## fgraze scale(alt, scale = F) scale(l_area, scale = F)
## 1 2 3
## scale(l_dist, scale = F) scale(l_ldist, scale = F) scale(yr_isol, scale = F)
## 4 5 6
##
## Model-averaged coefficients:
## (full average)
## Estimate Std. Error Adjusted SE z value Pr(>|z|)
## (Intercept) 22.360879 2.076214 2.124400 10.526 < 2e-16
## fgraze2 0.408791 2.922781 2.996427 0.136 0.89148
## fgraze3 -0.137746 2.580701 2.645688 0.052 0.95848
## fgraze4 -1.460970 2.981305 3.055740 0.478 0.63257
## fgraze5 -11.568196 4.031385 4.097761 2.823 0.00476
## scale(l_area, scale = F) 3.136494 0.579214 0.593126 5.288 1e-07
## scale(alt, scale = F) 0.002593 0.011973 0.012201 0.212 0.83173
## scale(l_ldist, scale = F) 0.051416 0.390263 0.399061 0.129 0.89748
## scale(yr_isol, scale = F) 0.001858 0.036612 0.037063 0.050 0.96002
## scale(l_dist, scale = F) 0.034661 0.476295 0.487911 0.071 0.94337
##
## (Intercept) ***
## fgraze2
## fgraze3
## fgraze4
## fgraze5 **
## scale(l_area, scale = F) ***
## scale(alt, scale = F)
## scale(l_ldist, scale = F)
## scale(yr_isol, scale = F)
## scale(l_dist, scale = F)
##
## (conditional average)
## Estimate Std. Error Adjusted SE z value Pr(>|z|)
## (Intercept) 22.360879 2.076214 2.124400 10.526 < 2e-16
## fgraze2 0.425760 2.981616 3.056804 0.139 0.889227
## fgraze3 -0.143464 2.633563 2.699889 0.053 0.957623
## fgraze4 -1.521615 3.027351 3.103687 0.490 0.623949
## fgraze5 -12.048392 3.337828 3.420975 3.522 0.000428
## scale(l_area, scale = F) 3.136494 0.579214 0.593126 5.288 1e-07
## scale(alt, scale = F) 0.011481 0.023082 0.023605 0.486 0.626694
## scale(l_ldist, scale = F) 0.245025 0.823641 0.843492 0.290 0.771443
## scale(yr_isol, scale = F) 0.007858 0.074986 0.075919 0.104 0.917559
## scale(l_dist, scale = F) 0.172973 1.052708 1.078930 0.160 0.872630
##
## (Intercept) ***
## fgraze2
## fgraze3
## fgraze4
## fgraze5 ***
## scale(l_area, scale = F) ***
## scale(alt, scale = F)
## scale(l_ldist, scale = F)
## scale(yr_isol, scale = F)
## scale(l_dist, scale = F)
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Conditional average is that it does not include NA values, so it is biased away from zero (obvious only for weaker variable effects).
Pre-register models to explain the variables
loyn.glm1 <- update(loyn.glm, . ~
scale(l_dist, scale=FALSE) * scale(l_ldist, scale=FALSE))
loyn.glm2 <- update(loyn.glm, . ~
scale(l_area, scale=FALSE) * fgraze)
loyn.glm3 <- update(loyn.glm, . ~
scale(l_area, scale=FALSE) * fgraze * scale(yr_isol, scale=FALSE))
loyn.glm4 <- update(loyn.glm, . ~
scale(alt, scale=FALSE))
loyn.null <- update(loyn.glm, . ~ 1)
AICc(loyn.glm1, loyn.glm2, loyn.glm3, loyn.glm4, loyn.null) %>% arrange(AICc)
tidy(loyn.glm2)
glance(loyn.glm2)
loyn.glm2b <- update(loyn.glm, . ~ sl_area * fgraze)
loyn.glm2b.s <- emmeans(loyn.glm2b, "fgraze", data=loyn)
## NOTE: Results may be misleading due to involvement in interactions
pairs(loyn.glm2b)
emtrends(loyn.glm2b, pairwise~fgraze, var = 'sl_area')
## $emtrends
## fgraze sl_area.trend SE df asymp.LCL asymp.UCL
## 1 3.37 1.67 Inf 0.0909 6.64
## 2 6.92 1.89 Inf 3.2058 10.63
## 3 7.42 2.33 Inf 2.8505 11.99
## 4 15.74 4.51 Inf 6.8972 24.59
## 5 4.99 2.45 Inf 0.1879 9.79
##
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df z.ratio p.value
## 1 - 2 -3.549 2.53 Inf -1.405 0.6241
## 1 - 3 -4.054 2.87 Inf -1.413 0.6192
## 1 - 4 -12.377 4.81 Inf -2.572 0.0757
## 1 - 5 -1.621 2.96 Inf -0.547 0.9824
## 2 - 3 -0.505 3.00 Inf -0.168 0.9998
## 2 - 4 -8.828 4.89 Inf -1.804 0.3713
## 2 - 5 1.928 3.10 Inf 0.623 0.9715
## 3 - 4 -8.324 5.08 Inf -1.638 0.4727
## 3 - 5 2.432 3.38 Inf 0.719 0.9521
## 4 - 5 10.756 5.14 Inf 2.095 0.2223
##
## P value adjustment: tukey method for comparing a family of 5 estimates
loyn.grid <- with(loyn,
list(fgraze = levels(fgraze),
sl_area = seq_range(sl_area, n=100)))
newdata <- emmeans(loyn.glm2b, ~ sl_area|fgraze, at = loyn.grid) %>% as.data.frame %>%
rename(abund = emmean, lwr = asymp.LCL, upr = asymp.UCL)
newdata %>%
ggplot(aes(x = sl_area, y = abund, fill = fgraze)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2) +
geom_line(aes(col = fgraze)) +
geom_point(data = loyn, aes(col = fgraze)) +
coord_trans(x = "exp")
## Warning in trans$inverse(continuous_range_coord): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
## Warning in self$trans$x$inverse(panel_params$x.range): NaNs produced
However, this plot is incorrect, as it is not on the correct response scale. To do this, we can refit our model with scale(scale=FALSE) and it should remember what our original variables were.
newdata %>%
ggplot(aes(x = sl_area, y = abund, fill = fgraze)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2) +
geom_line(aes(col = fgraze)) +
geom_point(data = loyn, aes(col = fgraze))
# coord_trans(x = "exp")
To avoid negative values, update to Gamma family
loyn.glm2a <- glm(abund ~ scale(log(area), scale=FALSE) * fgraze,
data = loyn, family = Gamma(link = "log"))
loyn.grid <- with(loyn,
list(fgraze = levels(fgraze),
area = seq_range(area, n=100)))
newdata <- emmeans(loyn.glm2a, ~area|fgraze, at=loyn.grid, type = 'response') %>%
as.data.frame %>% rename(abund = response, lwr = asymp.LCL, upr = asymp.UCL)
newdata %>%
ggplot(aes(x = area, y = abund, fill = fgraze)) +
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2) +
geom_line(aes(col = fgraze)) +
geom_point(data = loyn, aes(col = fgraze)) +
scale_x_log10(labels = scales::comma) +
scale_y_log10() +
labs(x = expression(Area~(m^2)), y = "Forest bird abundance")